WSDM KKBox - Churn Prediction
Install and Load Packages
knitr::opts_chunk$set(echo = TRUE)
Install_And_Load <- function(packages) {
k <- packages[!(packages %in% installed.packages()[,"Package"])];
if(length(k))
{install.packages(k, repos='https://cran.rstudio.com/');}
for(package_name in packages)
{suppressMessages(library(package_name,character.only=TRUE, quietly = TRUE));}
}
Install_And_Load(c("RevoScaleR", "dplyr", "stringr",
"lubridate", "plotly",
"rmdformats"))## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## Warning: package 'lubridate' was built under R version 3.4.4
## Warning: package 'plotly' was built under R version 3.4.4
## Warning: package 'ggplot2' was built under R version 3.4.4
rxOptions(reportProgress = 3) # reduces the amount of output RevoScaleR produces
options(dplyr.print_max = 200)
options(dplyr.width = Inf) # shows all columns of a tbl_df object
directory <- "E:/Documents/PersonalProjects/WsdmKKBox-ChurnPrediction/"
# paste0(directory,
# "Data/Processed/Rda/Plotly/",
# "genderChurn_donutchart",
# "_Plotly.rda"))
# path <- "E:/Documents/POCs/HRanalytics/OrionHRanalytics/"
loadAndPlot <- function(variable, prefix, suffix = "_Plotly.rda") {
ext <- paste0(variable, suffix)
load(paste0(directory, "Data/Processed/Rda/Plotly/", ext))
p
}Data Ingestion
Using Machine Learning Server (also known as Microsft R Server) we read in the following files - transactions, members, train and user log data. These files are saved as .xdf file on the disk and not as dataframe since they are huge.
Few data manipulations were done on the fly - Converting some columns to factor - Converting character to date type
# transactions.csv ####
csvPath <- paste0(directory, "Data/Base/transactions.csv")
transactionsXdf <- paste0(directory, "Data/Processed/Xdf/transactions.xdf")
rxImport(inData = csvPath,
outFile = transactionsXdf,
colClasses = c(payment_method_id = 'factor',
is_auto_renew = 'factor',
is_cancel = 'factor'),
transforms = list(transaction_date = as.Date(as.character(transaction_date), format="%Y%m%d"),
membership_expire_date = as.Date(as.character(membership_expire_date), format="%Y%m%d")),
overwrite = TRUE,
rowsPerRead = 5000000,
reportProgress = 3)
# transactionsXdf <- RxXdfData(xdfPath)
# rxGetInfo(transactionsXdf, getVarInfo = TRUE, numRows = 10)
# members data ####
csvPath <- paste0(directory, "Data/Base/members_v3.csv")
membersXdf <- paste0(directory, "Data/Processed/Xdf/members.xdf")
rxImport(inData = csvPath,
outFile = membersXdf,
colClasses = c(city = 'factor',
gender = 'factor',
registered_via = 'factor'),
# transforms = list(registration_init_time = as.Date(as.character(registration_init_time), format="%Y%m%d")),
overwrite = TRUE,
reportProgress = 3)
membersXdf <- RxXdfData(membersXdf)
# rxGetInfo(membersXdf, getVarInfo = TRUE, numRows = 10)
# rxSummary(~city, membersXdf)
# rxSummary(~registered_via, membersXdf)
rxFactors(inData = membersXdf, outFile = membersXdf,
factorInfo = list(city = list(varName = "city",
# levels = 1:21,
otherLevel = NULL,
sortLevels = TRUE),
registered_via = list(varName = "registered_via",
# levels = 1:19,
otherLevel = NULL,
sortLevels = TRUE)),
# sortLevels = TRUE,
overwrite = TRUE)
# rxGetInfo(membersXdf, getVarInfo = TRUE, numRows = 10)
# user logs data ####
csvPath <- paste0(directory, "Data/Base/user_logs.csv")
userLogsXdf <- paste0(directory, "Data/Processed/Xdf/userLogs.xdf")
rxImport(inData = csvPath,
outFile = userLogsXdf,
# colClasses = c(city = 'factor',
# gender = 'factor',
# registered_via = 'factor'),
transforms = list(date = as.Date(as.character(date), format="%Y%m%d")),
overwrite = TRUE)
userLogsXdf <- RxXdfData(userLogsXdf)
# rxGetInfo(userLogsXdf, getVarInfo = TRUE, numRows = 10)
# train data ####
csvPath <- paste0(directory, "Data/Base/train_v2.csv")
trainXdf <- paste0(directory, "Data/Processed/Xdf/train.xdf")
rxImport(inData = csvPath,
outFile = trainXdf,
colClasses = c(is_churn = 'factor'),
overwrite = TRUE,
reportProgress = 3)
trainXdf <<- RxXdfData(trainXdf)
# rxGetInfo(trainXdf, getVarInfo = TRUE, numRows = 10)The above code is set not to run while knitting as it might take some time. So I have to set the paths to xdf objects again.
transactionsXdf <- RxXdfData(paste0(directory, "Data/Processed/Xdf/transactions.xdf"))
membersXdf <- RxXdfData(paste0(directory, "Data/Processed/Xdf/members.xdf"))
userLogsXdf <- RxXdfData(paste0(directory, "Data/Processed/Xdf/userLogs.xdf"))
trainXdf <- RxXdfData(paste0(directory, "Data/Processed/Xdf/train.xdf"))Lets have a glimpse of these datasets
# transactions ####
rxGetInfo(transactionsXdf, getVarInfo = TRUE, numRows = 5)## File name: E:\Documents\PersonalProjects\WsdmKKBox-ChurnPrediction\Data\Processed\Xdf\transactions.xdf
## Number of observations: 2e+07
## Number of variables: 9
## Number of blocks: 4
## Compression type: zlib
## Variable information:
## Var 1: msno, Type: character
## Var 2: payment_method_id
## 40 factor levels: 41 39 21 37 40 ... 6 3 2 4 1
## Var 3: payment_plan_days, Type: integer, Low/High: (0, 450)
## Var 4: plan_list_price, Type: integer, Low/High: (0, 2000)
## Var 5: actual_amount_paid, Type: integer, Low/High: (0, 2000)
## Var 6: is_auto_renew
## 2 factor levels: 1 0
## Var 7: transaction_date, Type: Date, Low/High: (2015-01-01, 2017-02-28)
## Var 8: membership_expire_date, Type: Date, Low/High: (1970-01-01, 2017-03-31)
## Var 9: is_cancel
## 2 factor levels: 0 1
## Data (5 rows starting with row 1):
## msno payment_method_id
## 1 YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc= 41
## 2 AZtu6Wl0gPojrEQYB8Q3vBSmE2wnZ3hi1FbK1rQQ0A4= 41
## 3 UkDFI97Qb6+s2LWcijVVv4rMAsORbVDT2wNXF0aVbns= 41
## 4 M1C56ijxozNaGD0t2h68PnH2xtx5iO5iR2MVYQB6nBI= 39
## 5 yvj6zyBUaqdbUQSrKsrZ+xNDVM62knauSZJzakS9OW4= 39
## payment_plan_days plan_list_price actual_amount_paid is_auto_renew
## 1 30 129 129 1
## 2 30 149 149 1
## 3 30 129 129 1
## 4 30 149 149 1
## 5 30 149 149 1
## transaction_date membership_expire_date is_cancel
## 1 2015-09-30 2015-11-01 0
## 2 2015-09-30 2015-10-31 0
## 3 2015-09-30 2016-04-27 0
## 4 2015-09-30 2015-11-28 0
## 5 2015-09-30 2015-11-21 0
# members ####
rxGetInfo(membersXdf, getVarInfo = TRUE, numRows = 5)## File name: E:\Documents\PersonalProjects\WsdmKKBox-ChurnPrediction\Data\Processed\Xdf\members.xdf
## Number of observations: 6769473
## Number of variables: 6
## Number of blocks: 14
## Compression type: zlib
## Variable information:
## Var 1: msno, Type: character
## Var 2: city
## 21 factor levels: 1 10 11 12 13 ... 5 6 7 8 9
## Var 3: bd, Type: integer, Low/High: (-7168, 2016)
## Var 4: gender
## 2 factor levels: female male
## Var 5: registered_via
## 18 factor levels: -1 1 10 11 13 ... 5 6 7 8 9
## Var 6: registration_init_time, Type: integer, Low/High: (20040326, 20170429)
## Data (5 rows starting with row 1):
## msno city bd gender
## 1 Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8= 1 0 <NA>
## 2 +tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU= 1 0 <NA>
## 3 cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw= 1 0 <NA>
## 4 9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g= 1 0 <NA>
## 5 WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk= 6 32 female
## registered_via registration_init_time
## 1 11 20110911
## 2 7 20110914
## 3 11 20110915
## 4 11 20110915
## 5 9 20110915
# user log ####
rxGetInfo(userLogsXdf, getVarInfo = TRUE, numRows = 5)## File name: E:\Documents\PersonalProjects\WsdmKKBox-ChurnPrediction\Data\Processed\Xdf\userLogs.xdf
## Number of observations: 392106543
## Number of variables: 9
## Number of blocks: 785
## Compression type: zlib
## Variable information:
## Var 1: msno, Type: character
## Var 2: date, Type: Date, Low/High: (2015-01-01, 2017-02-28)
## Var 3: num_25, Type: integer, Low/High: (0, 18798)
## Var 4: num_50, Type: integer, Low/High: (0, 1710)
## Var 5: num_75, Type: integer, Low/High: (0, 1690)
## Var 6: num_985, Type: integer, Low/High: (0, 2747)
## Var 7: num_100, Type: integer, Low/High: (0, 42004)
## Var 8: num_unq, Type: integer, Low/High: (1, 4784)
## Var 9: total_secs, Type: numeric, Storage: float32, Low/High: (-9223372474941440.0000, 9223372474941440.0000)
## Data (5 rows starting with row 1):
## msno date num_25 num_50
## 1 rxIP2f2aN0rYNp+toI0Obt/N/FYQX8hcO1fTmmy2h34= 2015-05-13 0 0
## 2 rxIP2f2aN0rYNp+toI0Obt/N/FYQX8hcO1fTmmy2h34= 2015-07-09 9 1
## 3 yxiEWwE9VR5utpUecLxVdQ5B7NysUPfrNtGINaM2zA8= 2015-01-05 3 3
## 4 yxiEWwE9VR5utpUecLxVdQ5B7NysUPfrNtGINaM2zA8= 2015-03-06 1 0
## 5 yxiEWwE9VR5utpUecLxVdQ5B7NysUPfrNtGINaM2zA8= 2015-05-01 3 0
## num_75 num_985 num_100 num_unq total_secs
## 1 0 0 1 1 280.335
## 2 0 0 7 11 1658.948
## 3 0 0 68 36 17364.955
## 4 1 1 97 27 24667.316
## 5 0 0 38 38 9649.029
# train ####
rxGetInfo(trainXdf, getVarInfo = TRUE, numRows = 5)## File name: E:\Documents\PersonalProjects\WsdmKKBox-ChurnPrediction\Data\Processed\Xdf\train.xdf
## Number of observations: 970960
## Number of variables: 2
## Number of blocks: 2
## Compression type: zlib
## Variable information:
## Var 1: msno, Type: character
## Var 2: is_churn
## 2 factor levels: 1 0
## Data (5 rows starting with row 1):
## msno is_churn
## 1 ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg= 1
## 2 f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE= 1
## 3 zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg= 1
## 4 8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ= 1
## 5 K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA= 1
EDA
Churning Members’ Profile (Members data & train data)
We shall merge members data and train data (labelled data).
churnMembersXdf <- paste0(directory, "Data/Processed/Xdf/churnMembers.xdf")
rxMerge(inData1 = membersXdf,
inData2 = trainXdf,
outFile = churnMembersXdf,
matchVars = "msno",
type = "inner",
overwrite = TRUE)Lets have a glimpse of it,
churnMembersXdf <- RxXdfData(paste0(directory, "Data/Processed/Xdf/churnMembers.xdf"))
rxGetInfo(churnMembersXdf, getVarInfo = TRUE, numRows = 5)## File name: E:\Documents\PersonalProjects\WsdmKKBox-ChurnPrediction\Data\Processed\Xdf\churnMembers.xdf
## Number of observations: 860967
## Number of variables: 7
## Number of blocks: 2
## Compression type: zlib
## Variable information:
## Var 1: msno, Type: character
## Var 2: city
## 21 factor levels: 1 10 11 12 13 ... 5 6 7 8 9
## Var 3: bd, Type: integer, Low/High: (-3152, 2016)
## Var 4: gender
## 2 factor levels: female male
## Var 5: registered_via
## 18 factor levels: -1 1 10 11 13 ... 5 6 7 8 9
## Var 6: registration_init_time, Type: integer, Low/High: (20040326, 20170424)
## Var 7: is_churn
## 2 factor levels: 1 0
## Data (5 rows starting with row 1):
## msno city bd gender
## 1 +++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o= 1 0 <NA>
## 2 +++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw= 15 26 male
## 3 +++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc= 1 0 <NA>
## 4 ++/9R3sX37CjxbY/AaGvbwr3QkwElKBCtSvVzhCBDOk= 18 21 male
## 5 ++/UDNo9DLrxT8QVGiDi1OnWfczAdEwThaVyD0fXO50= 22 32 male
## registered_via registration_init_time is_churn
## 1 7 20140608 0
## 2 9 20101118 0
## 3 7 20140927 0
## 4 7 20160315 0
## 5 9 20120122 0
rxSummary(~gender,
data = churnMembersXdf)## ReadNum=1, CurrentBlockNum=2, CurrentNumRows=484507, TotalRowsProcessed=484507, ReadTime=0.005, ProcessDataTime = 0, LoopTime = 0.005
## ReadNum=2, CurrentBlockNum=3, CurrentNumRows=376460, TotalRowsProcessed=860967, ReadTime=0.003, ProcessDataTime = 0.009, LoopTime = 0.004
## Total loop time: 0.009
## Total read time for 2 reads: 0.008
## Total process data time: 0.015
## Average read time per read: 0.004
## Average process data time per read: 0.0075
## Number of threads used: 4
## Computation time: 0.014 seconds.
## Call:
## rxSummary(formula = ~gender, data = churnMembersXdf)
##
## Summary Statistics Results for: ~gender
## Data: churnMembersXdf (RxXdfData Data Source)
## File name:
## E:/Documents/PersonalProjects/WsdmKKBox-ChurnPrediction/Data/Processed/Xdf/churnMembers.xdf
## Number of valid observations: 860967
##
##
## Category Counts for gender
## Number of categories: 2
## Number of valid observations: 388905
## Number of missing observations: 472062
##
## gender Counts
## female 184344
## male 204561
Gender
genderChurnData <- rxCube(formula = ~ gender : is_churn,
data = churnMembersXdf,
returnDataFrame = TRUE,
overwrite = TRUE)
# d <- data.frame(table(watson$Partner, watson$Churn))
churnGender <- genderChurnData[genderChurnData$is_churn == 1, ]
nonchurnGender <- genderChurnData[genderChurnData$is_churn == 0, ]
p <- plot_ly() %>%
add_pie(data = churnGender, labels = ~gender, values = ~Counts, hole = 0.6,
name = "Churn Customers", domain = list(x = c(0, 0.45), y = c(0.2, 0.8))) %>%
add_pie(data = nonchurnGender, labels = ~gender, values = ~Counts, hole = 0.6,
name = "Non-churn Customers", domain = list(x = c(0.55, 1), y = c(0.2, 0.8))) %>%
layout(title = 'Gender distribution in customer attrition',
annotations = list( list(text = "Churn customers",
font = list(size = 11),
showarrow = FALSE,
x = .14, y = .5),
list(text = "Non churn customers",
font = list(size = 11),
showarrow = FALSE,
x = .88, y = .5)),
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
save(p,
file = paste0(directory,
"Data/Processed/Rda/Plotly/",
"genderChurn_donutchart",
"_Plotly.rda"))loadAndPlot(variable = "genderChurn_donutchart")Donut charts of Churn wrt Gender, City and registered_via
multiFacVarChurnData <- rxCube(formula = ~ gender : city : registered_via : is_churn ,
data = churnMembersXdf,
returnDataFrame = TRUE,
overwrite = TRUE)
print(multiFacVarChurnData)
saveDoubleDonutChart <- function(data, varCol, fixCol ) {
# varCol <- 1
# data <- multiFacVarChurnData
varName <- colnames(data[varCol])
fixCol <- grep("is_churn", colnames(data))
varChurnData <- aggregate(x = data$Counts,
by = list(data[[varCol]] , data[[fixCol]]),
FUN = sum)
colnames(varChurnData) <- c("variable", "is_churn", "count")
churnVar <- varChurnData[varChurnData$is_churn == 1, ]
nonchurnVar <- varChurnData[varChurnData$is_churn == 0, ]
p <- plot_ly() %>%
add_pie(data = churnVar, labels = ~variable, values = ~count, hole = 0.6,
name = "Churn Customers", domain = list(x = c(0, 0.45), y = c(0.2, 0.8))) %>%
add_pie(data = nonchurnVar, labels = ~variable, values = ~count, hole = 0.6,
name = "Non-churn Customers", domain = list(x = c(0.55, 1), y = c(0.2, 0.8))) %>%
layout(title = paste(varName, 'distribution in customer attrition'),
annotations = list( list(text = "Churn customers",
font = list(size = 11),
showarrow = FALSE,
x = .14, y = .5),
list(text = "Non churn customers",
font = list(size = 11),
showarrow = FALSE,
x = .88, y = .5)),
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
save(p,
file = paste0(directory,
"Data/Processed/Rda/Plotly/",
varName,
"Churn_donutchart",
"_Plotly.rda"))
}
# saveDoubleDonutChart(data = multiFacVarChurnData,
# varCol = 1)
lapply(1:3,
FUN = function(x) saveDoubleDonutChart(data = multiFacVarChurnData, varCol = x))loadAndPlot(variable = "genderChurn_donutchart")loadAndPlot(variable = "cityChurn_donutchart")loadAndPlot(variable = "registered_viaChurn_donutchart")Age distribuion of churners
rxHistogram(~ F(bd, 10, 100)|is_churn, data = churnMembersXdf)## ReadNum=1, CurrentBlockNum=2, CurrentNumRows=484507, TotalRowsProcessed=484507, ReadTime=0.008, ProcessDataTime = 0, LoopTime = 0.008
## ReadNum=2, CurrentBlockNum=3, CurrentNumRows=376460, TotalRowsProcessed=860967, ReadTime=0.006, ProcessDataTime = 0.009, LoopTime = 0.01
## Total loop time: 0.018
## Total read time for 2 reads: 0.014
## Total process data time: 0.018
## Average read time per read: 0.007
## Average process data time per read: 0.009
## Number of threads used: 4
## Computation time: 0.027 seconds.
Pattern of registration of churning members
registChurnData <- rxCube(formula = ~ F(registration_init_time): is_churn,
data = churnMembersXdf,
returnDataFrame = TRUE,
overwrite = TRUE)
# summary(registChurnData)
registChurnData$F_registration_init_time <- as.Date(as.character(registChurnData$F_registration_init_time), format="%Y%m%d")
levels(registChurnData$is_churn) <- c('Churn', 'Non-churn')
p <- plot_ly(registChurnData,
x = ~F_registration_init_time,
y = ~Counts,
color = ~is_churn,
mode = 'lines',
type = 'scatter' ) %>%
layout(title = "Distribution of churning subscribers over the registration time",
xaxis = list(title = "Registration Date"),
yaxis = list(title = "No of Subscribers"))
save(p,
file = paste0(directory,
"Data/Processed/Rda/Plotly/",
"churnRegistSubscribers",
"_Plotly.rda"))Visualising it, we get
loadAndPlot("churnRegistSubscribers")